R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Load the chart data. The file is not UTF-8: track names contain Latin-1
# bytes (for example the "ñ" in "Señorita" is stored as the single byte 0xF1).
# Declaring the encoding lets R convert the text on read instead of keeping
# invalid byte sequences in Track.Name, which later feed axis labels.
spotify <- read.csv("Spotify_top50.csv", fileEncoding = "latin1")
str(spotify)
## 'data.frame':    50 obs. of  14 variables:
##  $ X               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Track.Name      : chr  "Se\xf1orita" "China" "boyfriend (with Social House)" "Beautiful People (feat. Khalid)" ...
##  $ Artist.Name     : chr  "Shawn Mendes" "Anuel AA" "Ariana Grande" "Ed Sheeran" ...
##  $ Genre           : chr  "canadian pop" "reggaeton flow" "dance pop" "pop" ...
##  $ Beats.Per.Minute: int  117 105 190 93 150 102 180 111 136 135 ...
##  $ Energy          : int  55 81 80 65 65 68 64 68 62 43 ...
##  $ Danceability    : int  76 79 40 64 58 80 75 48 88 70 ...
##  $ Loudness..dB..  : int  -6 -4 -4 -8 -4 -5 -6 -5 -6 -11 ...
##  $ Liveness        : int  8 8 16 8 11 9 7 8 11 10 ...
##  $ Valence.        : int  75 61 70 55 18 84 23 35 64 56 ...
##  $ Length.         : int  191 302 186 198 175 220 131 202 157 194 ...
##  $ Acousticness..  : int  4 8 12 12 45 9 2 15 5 33 ...
##  $ Speechiness.    : int  3 9 46 19 7 4 29 9 10 38 ...
##  $ Popularity      : int  79 92 85 86 94 84 92 90 87 95 ...
library(tidyverse) # for data wrangling
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Drop `X`, which only holds the row number (1, 2, 3, ...) of each record.
spotify <- select(spotify, -X)

# Number of missing values in each remaining column.
spotify %>%
  is.na() %>%
  colSums()
##       Track.Name      Artist.Name            Genre Beats.Per.Minute 
##                0                0                0                0 
##           Energy     Danceability   Loudness..dB..         Liveness 
##                0                0                0                0 
##         Valence.          Length.   Acousticness..     Speechiness. 
##                0                0                0                0 
##       Popularity 
##                0
library(plotly) # for interactive plot
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(glue) # for glue text

# Ten most popular tracks. `mean_length` is the average Length. of these ten
# rows; `text` is a two-line "Artist / Genre" string used as the hover tooltip.
top10_song <- spotify %>%
  arrange(desc(Popularity)) %>%
  head(n = 10) %>%
  select(Track.Name, Artist.Name, Genre, Popularity, Length.) %>%
  mutate(mean_length = mean(Length.)) %>%
  mutate(text = glue(
    "Artist = {Artist.Name}
    Genre = {Genre}"
  ))

# Horizontal bar chart of the ten most popular tracks. Bars are ordered and
# filled by Popularity, the score is printed inside each bar, and the `text`
# aesthetic carries the artist/genre string that ggplotly() shows on hover.
plot_top10_song <- ggplot(data = top10_song, aes(x = reorder(Track.Name, Popularity),
                                                 y = Popularity,
                                                 text = text,
                                                 label = Popularity))+
  # FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
  geom_col(aes(fill = Popularity), show.legend = FALSE)+
  theme_bw()+
  coord_flip()+
  theme(axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, colour = "black"),
        title = element_text(size = 12, colour = "black"))+
  # vjust = 0.8 places the label at 80% of the bar length.
  geom_text(aes(label = Popularity), color = "white", size = 6, fontface = "bold", position = position_stack(vjust = 0.8))+
  labs(title = "Top 10 Song on Spotify in 2022",
       x = "Song Title",
       y = "Popularity Rate",
       caption = "Source : Kaggle Dataset")

# Convert to an interactive plotly widget; only the `text` aesthetic is shown
# in the tooltip.
ggplotly(plot_top10_song, tooltip = "text")
# Share of the chart held by each genre, keeping the three largest shares.
# The share is the genre's track count divided by the total track count, so it
# stays correct if the input file ever has a different number of rows.
top3_genre <- spotify %>%
  group_by(Genre) %>%
  summarise(song = n(), .groups = "drop") %>%
  mutate(song = song / sum(song)) %>%
  arrange(desc(song)) %>%
  head(3)

library(ggplot2) #to make plot
# Horizontal bar chart of the three most common genres. Each bar is labelled
# with the genre's share of the chart, formatted as a percentage.
plot_top3_genre <- ggplot(
  top3_genre,
  mapping = aes(x = reorder(Genre, song), y = song, label = song)
) +
  geom_col(mapping = aes(fill = song), show.legend = FALSE) +
  theme_bw() +
  coord_flip() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, colour = "black"),
    title = element_text(size = 14, colour = "black")
  ) +
  geom_text(
    mapping = aes(label = scales::percent(song)),
    color = "white",
    size = 12,
    fontface = "bold",
    position = position_stack(0.7)
  ) +
  labs(
    title = "Top 3 Genre of Spotify Most Popular Song 2022",
    x = "Genre of Music",
    y = "Rate of Genre",
    caption = "Source : Kaggle Dataset"
  )

plot_top3_genre

# Keep only the numeric audio features that will be used for clustering.
# Popularity is numeric as well, but it is excluded because it is not one of
# the features the clustering is based on.
spotify_ppt <- spotify %>%
  select(where(is.numeric), -Popularity)

glimpse(spotify_ppt)
## Rows: 50
## Columns: 9
## $ Beats.Per.Minute <int> 117, 105, 190, 93, 150, 102, 180, 111, 136, 135, 176,…
## $ Energy           <int> 55, 81, 80, 65, 65, 68, 64, 68, 62, 43, 62, 71, 41, 7…
## $ Danceability     <int> 76, 79, 40, 64, 58, 80, 75, 48, 88, 70, 61, 82, 50, 7…
## $ Loudness..dB..   <int> -6, -4, -4, -8, -4, -5, -6, -5, -6, -11, -5, -4, -6, …
## $ Liveness         <int> 8, 8, 16, 8, 11, 9, 7, 8, 11, 10, 24, 15, 11, 6, 12, …
## $ Valence.         <int> 75, 61, 70, 55, 18, 84, 23, 35, 64, 56, 24, 38, 45, 7…
## $ Length.          <int> 191, 302, 186, 198, 175, 220, 131, 202, 157, 194, 251…
## $ Acousticness..   <int> 4, 8, 12, 12, 45, 9, 2, 15, 5, 33, 60, 28, 75, 7, 10,…
## $ Speechiness.     <int> 3, 9, 46, 19, 7, 4, 29, 9, 10, 38, 31, 7, 3, 20, 5, 1…
# Centre every feature to mean 0 and scale it to unit standard deviation.
spotify_scale <- scale(spotify_ppt, center = TRUE, scale = TRUE)
# Switch sample() to the legacy "Rounding" sampler so that seeded results match
# those produced with it elsewhere; R warns that this sampler is non-uniform.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
# Draw an elbow plot for choosing the number of k-means clusters.
#
# Fits kmeans() once for every cluster count from 2 to `maxK` and plots the
# total within-cluster sum of squares (`tot.withinss`) against the count.
#
# The RNG is re-seeded with 101 before every fit so the curve is reproducible;
# as a consequence the caller's global RNG state is reset as well.
#
# Args:
#   data: numeric matrix or data frame accepted by kmeans().
#   maxK: largest number of clusters to try; a single number >= 2.
#
# Returns:
#   Invisibly, a data frame with columns `k` and `tot_withinss` holding the
#   plotted values.
kmeansTunning <- function(data, maxK) {
  # `2:maxK` would silently count downwards for maxK < 2, so reject it early.
  stopifnot(is.numeric(maxK), length(maxK) == 1L, !is.na(maxK), maxK >= 2)

  total_k <- 2:maxK
  withinall <- vapply(
    total_k,
    function(k) {
      set.seed(101)
      kmeans(data, k)$tot.withinss
    },
    numeric(1)
  )

  plot(x = total_k, y = withinall, type = "o", xlab = "Number of Cluster", ylab = "Total Within")
  invisible(data.frame(k = total_k, tot_withinss = withinall))
}

# Elbow plot for 2 to 7 clusters, computed on the standardised features.
kmeansTunning(data = spotify_scale, maxK = 7)

# Fit the final 6-cluster model and store each track's cluster as a factor.
# NOTE(review): the elbow plot uses `spotify_scale`, but this fit runs on the
# unscaled `spotify_ppt`, so features with large ranges (e.g. Length.,
# Beats.Per.Minute) weigh more in the distances. Confirm whether the fit was
# meant to use `spotify_scale`; changing it alters every result that follows.
set.seed(101)
spotify_cluster <- kmeans(x = spotify_ppt, centers = 6)
spotify_ppt[["cluster"]] <- as.factor(spotify_cluster[["cluster"]])
library(FactoMineR) # for PCA
# PCA of the audio features on unit-variance scaled data. Column 10 of
# `spotify_ppt` is the `cluster` factor; it is passed as a qualitative
# supplementary variable, and no graph is drawn by the call itself.
# NOTE(review): prefer TRUE/FALSE over T/F the next time this is re-run; the
# call is left as written here because summary() echoes it verbatim.
pca_spotify <- PCA(spotify_ppt, quali.sup =10, graph = F, scale.unit = T)

# Map of the individuals (tracks) without point labels, coloured by column 10
# (the k-means cluster).
plot.PCA(pca_spotify, choix = "ind", label = "none", habillage = 10)

# Eigenvalues, plus per-individual, per-variable and per-cluster coordinates.
summary(pca_spotify)
## 
## Call:
## PCA(X = spotify_ppt, scale.unit = T, quali.sup = 10, graph = F) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               2.252   1.578   1.273   1.015   0.898   0.732   0.692
## % of var.             25.020  17.532  14.144  11.282   9.982   8.139   7.691
## Cumulative % of var.  25.020  42.553  56.697  67.979  77.961  86.100  93.791
##                        Dim.8   Dim.9
## Variance               0.335   0.224
## % of var.              3.723   2.486
## Cumulative % of var.  97.514 100.000
## 
## Individuals (the 10 first)
##                      Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
## 1                |  1.886 |  0.154  0.021  0.007 | -0.310  0.122  0.027 |
## 2                |  3.269 |  2.085  3.860  0.407 |  0.189  0.045  0.003 |
## 3                |  4.937 | -0.002  0.000  0.000 |  4.103 21.336  0.691 |
## 4                |  1.874 | -0.689  0.422  0.135 | -0.050  0.003  0.001 |
## 5                |  2.816 | -0.725  0.467  0.066 |  0.039  0.002  0.000 |
## 6                |  2.102 |  1.293  1.485  0.378 | -0.314  0.125  0.022 |
## 7                |  3.624 | -1.589  2.242  0.192 |  2.276  6.565  0.394 |
## 8                |  2.364 | -0.052  0.002  0.000 | -0.098  0.012  0.002 |
## 9                |  2.182 | -0.066  0.004  0.001 |  0.394  0.196  0.033 |
## 10               |  3.905 | -3.226  9.242  0.682 |  1.025  1.333  0.069 |
##                   Dim.3    ctr   cos2  
## 1                -1.322  2.745  0.491 |
## 2                -0.135  0.028  0.002 |
## 3                 2.071  6.738  0.176 |
## 4                -0.159  0.040  0.007 |
## 5                 1.284  2.589  0.208 |
## 6                -1.315  2.718  0.392 |
## 7                -0.457  0.329  0.016 |
## 8                 1.211  2.303  0.262 |
## 9                -1.789  5.031  0.672 |
## 10               -0.063  0.006  0.000 |
## 
## Variables
##                     Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr
## Beats.Per.Minute | -0.231  2.375  0.053 |  0.840 44.735  0.706 |  0.082  0.522
## Energy           |  0.845 31.691  0.714 |  0.339  7.303  0.115 | -0.002  0.000
## Danceability     |  0.126  0.710  0.016 | -0.084  0.451  0.007 | -0.737 42.615
## Loudness..dB..   |  0.813 29.325  0.660 |  0.115  0.842  0.013 |  0.151  1.792
## Liveness         |  0.371  6.127  0.138 | -0.238  3.597  0.057 |  0.578 26.252
## Valence.         |  0.502 11.180  0.252 |  0.212  2.860  0.045 | -0.406 12.962
## Length.          |  0.359  5.718  0.129 |  0.009  0.006  0.000 |  0.355  9.896
## Acousticness..   | -0.362  5.832  0.131 | -0.277  4.868  0.077 |  0.211  3.505
## Speechiness.     | -0.398  7.042  0.159 |  0.747 35.339  0.558 |  0.177  2.456
##                    cos2  
## Beats.Per.Minute  0.007 |
## Energy            0.000 |
## Danceability      0.542 |
## Loudness..dB..    0.023 |
## Liveness          0.334 |
## Valence.          0.165 |
## Length.           0.126 |
## Acousticness..    0.045 |
## Speechiness.      0.031 |
## 
## Supplementary categories
##                      Dist    Dim.1   cos2 v.test    Dim.2   cos2 v.test  
## cluster_1        |  1.458 | -1.042  0.511 -3.032 |  0.434  0.089  1.510 |
## cluster_2        |  2.697 | -0.191  0.005 -0.297 |  2.412  0.800  4.480 |
## cluster_3        |  2.162 |  1.820  0.708  3.135 | -0.521  0.058 -1.072 |
## cluster_4        |  1.835 | -0.370  0.041 -0.808 | -1.136  0.383 -2.966 |
## cluster_5        |  1.285 |  0.140  0.012  0.387 | -0.553  0.186 -1.828 |
## cluster_6        |  3.224 |  2.047  0.403  2.412 |  0.802  0.062  1.129 |
##                   Dim.3   cos2 v.test  
## cluster_1        -0.532  0.133 -2.059 |
## cluster_2         1.016  0.142  2.100 |
## cluster_3         0.597  0.076  1.368 |
## cluster_4         0.886  0.233  2.575 |
## cluster_5        -0.704  0.300 -2.587 |
## cluster_6        -0.012  0.000 -0.018 |
# PCA plot with FactoMineR's default settings.
plot.PCA(x = pca_spotify)

# Describe each principal dimension by the variables and cluster categories
# most strongly associated with it.
pca_dimdesc <- dimdesc(res = pca_spotify)

# Description of the first dimension.
pca_dimdesc[["Dim.1"]]
## $quanti
##                correlation      p.value
## Energy           0.8447698 1.244339e-14
## Loudness..dB..   0.8126227 7.751612e-13
## Valence.         0.5017403 2.055450e-04
## Liveness         0.3714510 7.910044e-03
## Length.          0.3588254 1.049896e-02
## Acousticness..  -0.3623999 9.700876e-03
## Speechiness.    -0.3982108 4.182600e-03
## 
## $quali
##                R2      p.value
## cluster 0.4379973 8.282538e-05
## 
## $category
##                    Estimate     p.value
## cluster=cluster_3  1.419174 0.001110847
## cluster=cluster_6  1.646350 0.014255483
## cluster=cluster_1 -1.442859 0.001677372
## 
## attr(,"class")
## [1] "condes" "list"
# Variables (correlation circle) plot. The former `col.ind` argument was
# dropped: it colours individuals, which are not drawn when choix = "var".
plot.PCA(pca_spotify, choix = "var")

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Cluster map of the k-means result. The `cluster` label column is excluded by
# name rather than by position, so the call keeps working if columns are added
# to or reordered in `spotify_ppt`.
fviz_cluster(spotify_cluster,
             data = spotify_ppt[, setdiff(names(spotify_ppt), "cluster")])+
  theme_minimal()

# Within-cluster sum of squares, one value per cluster.
spotify_cluster$withinss
## [1] 28670.214  8828.800  6850.333 12348.444 18546.462  4674.667
# Total sum of squares of the data that was passed to kmeans().
spotify_cluster$totss
## [1] 193255
# Between-cluster sum of squares.
spotify_cluster$betweenss
## [1] 113336.1
# Share of the total sum of squares that lies between clusters; values closer
# to 1 mean the clusters account for more of the variation.
spotify_cluster$betweenss/spotify_cluster$totss
## [1] 0.5864587
# Cluster profiles: the mean of every feature within each cluster.
# across() replaces the superseded summarise_all() and yields the same columns.
spotify_ppt %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean))
## # A tibble: 6 × 10
##   cluster Beats…¹ Energy Dance…² Loudn…³ Liven…⁴ Valen…⁵ Length. Acous…⁶ Speec…⁷
##   <fct>     <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 1         142.    58      74.4   -7.07   12.3     51.2    167    19.7    14.8 
## 2 2         179.    70.4    63.6   -4.8    14       55.8    222.   25      29.8 
## 3 3          98.5   77      71.5   -4.5    29.8     69      227.   18.2     7.83
## 4 4          96.9   54.2    63.9   -5.89   18.8     29.9    213    14.9     6.89
## 5 5          99.3   65.4    74.8   -5.38    9.23    65.8    186.   33.5     9.46
## 6 6         124.    79.7    77.7   -3.33    7.67    65      300.    9.67   12   
## # … with abbreviated variable names ¹​Beats.Per.Minute, ²​Danceability,
## #   ³​Loudness..dB.., ⁴​Liveness, ⁵​Valence., ⁶​Acousticness.., ⁷​Speechiness.